import pandas as pd
import numpy as np
import seaborn as sns
import pylab as py
%pylab inline
Populating the interactive namespace from numpy and matplotlib
train=pd.read_csv('train.csv')
train
| customer_id | age | job | marital | education | default | housing | loan | contact | month | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | customer_id_39075 | 31 | admin. | married | university.degree | no | no | no | cellular | dec | ... | 3 | 999 | 1 | failure | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | 0 |
| 1 | customer_id_34855 | 31 | technician | single | university.degree | no | no | no | telephone | may | ... | 4 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| 2 | customer_id_7107 | 47 | blue-collar | married | basic.6y | unknown | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | 0 |
| 3 | customer_id_31614 | 36 | services | married | university.degree | no | no | no | cellular | may | ... | 1 | 999 | 1 | failure | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | 0 |
| 4 | customer_id_34878 | 34 | admin. | single | high.school | no | no | no | cellular | may | ... | 9 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28826 | customer_id_6265 | 60 | retired | married | professional.course | unknown | no | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.859 | 5191.0 | 0 |
| 28827 | customer_id_11284 | 39 | management | married | university.degree | no | no | no | telephone | jun | ... | 1 | 999 | 0 | nonexistent | 1.43 | 47.2325 | -29.26 | 5.963 | 5228.1 | 0 |
| 28828 | customer_id_38158 | 37 | admin. | married | high.school | no | yes | no | cellular | oct | ... | 1 | 4 | 1 | success | -3.37 | 46.2155 | -18.83 | 1.756 | 5017.5 | 1 |
| 28829 | customer_id_860 | 42 | management | married | university.degree | no | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.858 | 5191.0 | 0 |
| 28830 | customer_id_15795 | 31 | admin. | single | university.degree | no | yes | no | cellular | jul | ... | 2 | 999 | 0 | nonexistent | 1.43 | 46.9590 | -29.89 | 5.962 | 5228.1 | 0 |
28831 rows × 22 columns
test=pd.read_csv('test.csv')
test.sample(10)
| customer_id | age | job | marital | education | default | housing | loan | contact | month | ... | duration | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5909 | customer_id_20196 | 34 | technician | married | university.degree | no | no | yes | cellular | aug | ... | 5.516667 | 5 | 999 | 0 | nonexistent | 1.43 | 46.7220 | -25.27 | 5.967 | 5228.1 |
| 5960 | customer_id_32783 | 35 | unemployed | married | basic.9y | no | yes | yes | cellular | may | ... | 0.350000 | 6 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.301 | 5099.1 |
| 766 | customer_id_5323 | 49 | services | married | professional.course | no | yes | no | telephone | may | ... | 3.766667 | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.859 | 5191.0 |
| 8331 | customer_id_2075 | 58 | housemaid | divorced | basic.4y | unknown | no | no | telephone | may | ... | 1.433333 | 1 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.859 | 5191.0 |
| 5875 | customer_id_19647 | 54 | technician | divorced | high.school | no | yes | no | cellular | aug | ... | 1.683333 | 1 | 999 | 0 | nonexistent | 1.43 | 46.7220 | -25.27 | 5.970 | 5228.1 |
| 6834 | customer_id_18843 | 60 | blue-collar | married | basic.9y | no | no | no | cellular | aug | ... | 8.300000 | 1 | 999 | 0 | nonexistent | 1.43 | 46.7220 | -25.27 | 5.972 | 5228.1 |
| 2109 | customer_id_11836 | 43 | entrepreneur | married | university.degree | unknown | yes | no | telephone | jun | ... | 3.600000 | 2 | 999 | 0 | nonexistent | 1.43 | 47.2325 | -29.26 | 5.961 | 5228.1 |
| 2450 | customer_id_2877 | 37 | technician | married | professional.course | no | no | no | telephone | may | ... | 2.683333 | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.861 | 5191.0 |
| 7542 | customer_id_8889 | 40 | services | married | high.school | unknown | no | no | telephone | jun | ... | 6.983333 | 1 | 999 | 0 | nonexistent | 1.43 | 47.2325 | -29.26 | 5.868 | 5228.1 |
| 11505 | customer_id_4842 | 31 | blue-collar | married | basic.9y | no | yes | no | telephone | may | ... | 8.200000 | 1 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.860 | 5191.0 |
10 rows × 21 columns
target=[]
for i in train.columns:
if i not in test.columns:
target.append(i)
print(target)
['subscribed']
train.head(10)
| customer_id | age | job | marital | education | default | housing | loan | contact | month | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | customer_id_39075 | 31 | admin. | married | university.degree | no | no | no | cellular | dec | ... | 3 | 999 | 1 | failure | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | 0 |
| 1 | customer_id_34855 | 31 | technician | single | university.degree | no | no | no | telephone | may | ... | 4 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| 2 | customer_id_7107 | 47 | blue-collar | married | basic.6y | unknown | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | 0 |
| 3 | customer_id_31614 | 36 | services | married | university.degree | no | no | no | cellular | may | ... | 1 | 999 | 1 | failure | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | 0 |
| 4 | customer_id_34878 | 34 | admin. | single | high.school | no | no | no | cellular | may | ... | 9 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| 5 | customer_id_24606 | 48 | entrepreneur | married | university.degree | no | yes | yes | cellular | nov | ... | 2 | 999 | 1 | failure | -0.07 | 46.6000 | -29.40 | 5.193 | 5195.8 | 0 |
| 6 | customer_id_13094 | 34 | blue-collar | single | basic.4y | unknown | no | no | cellular | jul | ... | 2 | 999 | 0 | nonexistent | 1.43 | 46.9590 | -29.89 | 5.964 | 5228.1 | 0 |
| 7 | customer_id_36912 | 34 | technician | single | professional.course | no | no | no | cellular | jun | ... | 1 | 999 | 0 | nonexistent | -2.87 | 46.4815 | -28.56 | 2.217 | 5076.2 | 0 |
| 8 | customer_id_27834 | 39 | unemployed | single | university.degree | no | yes | no | cellular | mar | ... | 1 | 999 | 0 | nonexistent | -1.77 | 46.4215 | -35.00 | 2.642 | 5099.1 | 0 |
| 9 | customer_id_9302 | 40 | technician | single | professional.course | no | unknown | unknown | telephone | jun | ... | 3 | 999 | 0 | nonexistent | 1.43 | 47.2325 | -29.26 | 5.969 | 5228.1 | 0 |
10 rows × 22 columns
import cufflinks as cf
from pandas_profiling import ProfileReport
train.isnull()
| customer_id | age | job | marital | education | default | housing | loan | contact | month | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28826 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 28827 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 28828 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 28829 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 28830 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
28831 rows × 22 columns
train.isnull().sum()
customer_id 0 age 0 job 0 marital 0 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp_var_rate 0 cons_price_idx 0 cons_conf_idx 0 euribor3m 0 nr_employed 0 subscribed 0 dtype: int64
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 28831 entries, 0 to 28830 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 28831 non-null object 1 age 28831 non-null int64 2 job 28831 non-null object 3 marital 28831 non-null object 4 education 28831 non-null object 5 default 28831 non-null object 6 housing 28831 non-null object 7 loan 28831 non-null object 8 contact 28831 non-null object 9 month 28831 non-null object 10 day_of_week 28831 non-null object 11 duration 28831 non-null float64 12 campaign 28831 non-null int64 13 pdays 28831 non-null int64 14 previous 28831 non-null int64 15 poutcome 28831 non-null object 16 emp_var_rate 28831 non-null float64 17 cons_price_idx 28831 non-null float64 18 cons_conf_idx 28831 non-null float64 19 euribor3m 28831 non-null float64 20 nr_employed 28831 non-null float64 21 subscribed 28831 non-null int64 dtypes: float64(6), int64(5), object(11) memory usage: 4.8+ MB
train.describe()
| age | duration | campaign | pdays | previous | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 28831.000000 | 28831.000000 | 28831.000000 | 28831.000000 | 28831.000000 | 28831.000000 | 28831.000000 | 28831.000000 | 28831.000000 | 28831.000000 | 28831.000000 |
| mean | 42.011203 | 4.297919 | 2.575769 | 963.215844 | 0.172592 | 0.113202 | 46.788632 | -28.360564 | 4.623599 | 5167.011880 | 0.112761 |
| std | 10.450128 | 4.336882 | 2.752303 | 185.077567 | 0.494338 | 1.570978 | 0.289847 | 3.244405 | 1.735202 | 72.542598 | 0.316305 |
| min | 19.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | -3.370000 | 46.100500 | -35.560000 | 1.636000 | 4963.600000 | 0.000000 |
| 25% | 34.000000 | 1.700000 | 1.000000 | 999.000000 | 0.000000 | -1.770000 | 46.537500 | -29.890000 | 2.346000 | 5099.100000 | 0.000000 |
| 50% | 40.000000 | 3.000000 | 2.000000 | 999.000000 | 0.000000 | 1.130000 | 46.874500 | -29.260000 | 5.859000 | 5191.000000 | 0.000000 |
| 75% | 49.000000 | 5.300000 | 3.000000 | 999.000000 | 0.000000 | 1.430000 | 46.997000 | -25.480000 | 5.963000 | 5228.100000 | 0.000000 |
| max | 100.000000 | 81.966667 | 43.000000 | 999.000000 | 7.000000 | 1.430000 | 47.383500 | -18.830000 | 6.047000 | 5228.100000 | 1.000000 |
fig=py.figure(figsize=(10,8))
sns.set(style='darkgrid')
sns.heatmap(train.isnull(),yticklabels=False)
<AxesSubplot:>
train.profile_report()
train.columns
Index(['customer_id', 'age', 'job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration',
'campaign', 'pdays', 'previous', 'poutcome', 'emp_var_rate',
'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed',
'subscribed'],
dtype='object')
train.head()
| customer_id | age | job | marital | education | default | housing | loan | contact | month | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | customer_id_39075 | 31 | admin. | married | university.degree | no | no | no | cellular | dec | ... | 3 | 999 | 1 | failure | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | 0 |
| 1 | customer_id_34855 | 31 | technician | single | university.degree | no | no | no | telephone | may | ... | 4 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| 2 | customer_id_7107 | 47 | blue-collar | married | basic.6y | unknown | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | 0 |
| 3 | customer_id_31614 | 36 | services | married | university.degree | no | no | no | cellular | may | ... | 1 | 999 | 1 | failure | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | 0 |
| 4 | customer_id_34878 | 34 | admin. | single | high.school | no | no | no | cellular | may | ... | 9 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
5 rows × 22 columns
fig=py.figure(figsize=(10,8))
sns.boxplot(y='age',x='job',data=train)
py.tight_layout()
py.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), [Text(0, 0, 'admin.'), Text(1, 0, 'technician'), Text(2, 0, 'blue-collar'), Text(3, 0, 'services'), Text(4, 0, 'entrepreneur'), Text(5, 0, 'unemployed'), Text(6, 0, 'housemaid'), Text(7, 0, 'management'), Text(8, 0, 'unknown'), Text(9, 0, 'self-employed'), Text(10, 0, 'retired'), Text(11, 0, 'student')])
train['job'].unique()
array(['admin.', 'technician', 'blue-collar', 'services', 'entrepreneur',
'unemployed', 'housemaid', 'management', 'unknown',
'self-employed', 'retired', 'student'], dtype=object)
fig=py.figure(figsize=(10,8))
train[train['housing']=='yes']['cons_price_idx'].plot(kind='hist',bins=30,label='yes')
train[train['housing']=='no']['cons_price_idx'].plot(kind='hist',bins=30,label='no')
py.legend()
<matplotlib.legend.Legend at 0x1f381de35e0>
train.groupby(by='marital').count()['education'].plot()
<AxesSubplot:xlabel='marital'>
train.head()
| customer_id | age | job | marital | education | default | housing | loan | contact | month | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | customer_id_39075 | 31 | admin. | married | university.degree | no | no | no | cellular | dec | ... | 3 | 999 | 1 | failure | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | 0 |
| 1 | customer_id_34855 | 31 | technician | single | university.degree | no | no | no | telephone | may | ... | 4 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| 2 | customer_id_7107 | 47 | blue-collar | married | basic.6y | unknown | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | 0 |
| 3 | customer_id_31614 | 36 | services | married | university.degree | no | no | no | cellular | may | ... | 1 | 999 | 1 | failure | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | 0 |
| 4 | customer_id_34878 | 34 | admin. | single | high.school | no | no | no | cellular | may | ... | 9 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
5 rows × 22 columns
train
| customer_id | age | job | marital | education | default | housing | loan | contact | month | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | customer_id_39075 | 31 | admin. | married | university.degree | no | no | no | cellular | dec | ... | 3 | 999 | 1 | failure | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | 0 |
| 1 | customer_id_34855 | 31 | technician | single | university.degree | no | no | no | telephone | may | ... | 4 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| 2 | customer_id_7107 | 47 | blue-collar | married | basic.6y | unknown | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | 0 |
| 3 | customer_id_31614 | 36 | services | married | university.degree | no | no | no | cellular | may | ... | 1 | 999 | 1 | failure | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | 0 |
| 4 | customer_id_34878 | 34 | admin. | single | high.school | no | no | no | cellular | may | ... | 9 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28826 | customer_id_6265 | 60 | retired | married | professional.course | unknown | no | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.859 | 5191.0 | 0 |
| 28827 | customer_id_11284 | 39 | management | married | university.degree | no | no | no | telephone | jun | ... | 1 | 999 | 0 | nonexistent | 1.43 | 47.2325 | -29.26 | 5.963 | 5228.1 | 0 |
| 28828 | customer_id_38158 | 37 | admin. | married | high.school | no | yes | no | cellular | oct | ... | 1 | 4 | 1 | success | -3.37 | 46.2155 | -18.83 | 1.756 | 5017.5 | 1 |
| 28829 | customer_id_860 | 42 | management | married | university.degree | no | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.858 | 5191.0 | 0 |
| 28830 | customer_id_15795 | 31 | admin. | single | university.degree | no | yes | no | cellular | jul | ... | 2 | 999 | 0 | nonexistent | 1.43 | 46.9590 | -29.89 | 5.962 | 5228.1 | 0 |
28831 rows × 22 columns
test.head()
| customer_id | age | job | marital | education | default | housing | loan | contact | month | ... | duration | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | customer_id_32884 | 59 | technician | married | high.school | no | no | yes | cellular | may | ... | 6.183333 | 1 | 999 | 1 | failure | -1.77 | 46.4465 | -32.34 | 2.301 | 5099.1 |
| 1 | customer_id_3169 | 57 | unknown | married | unknown | unknown | yes | no | telephone | may | ... | 4.750000 | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 |
| 2 | customer_id_32206 | 35 | blue-collar | married | basic.9y | no | no | no | cellular | may | ... | 0.866667 | 1 | 999 | 1 | failure | -1.77 | 46.4465 | -32.34 | 2.315 | 5099.1 |
| 3 | customer_id_9403 | 38 | admin. | married | high.school | no | no | no | telephone | jun | ... | 5.916667 | 4 | 999 | 0 | nonexistent | 1.43 | 47.2325 | -29.26 | 5.969 | 5228.1 |
| 4 | customer_id_14020 | 29 | housemaid | married | high.school | no | yes | no | cellular | jul | ... | 3.150000 | 2 | 999 | 0 | nonexistent | 1.43 | 46.9590 | -29.89 | 5.965 | 5228.1 |
5 rows × 21 columns
train_id=train.select_dtypes(include=['number'])
train_id
| age | duration | campaign | pdays | previous | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 31 | 1.283333 | 3 | 999 | 1 | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | 0 |
| 1 | 31 | 0.200000 | 4 | 999 | 0 | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| 2 | 47 | 4.616667 | 2 | 999 | 0 | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | 0 |
| 3 | 36 | 1.166667 | 1 | 999 | 1 | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | 0 |
| 4 | 34 | 19.683333 | 9 | 999 | 0 | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28826 | 60 | 7.116667 | 2 | 999 | 0 | 1.13 | 46.9970 | -25.48 | 5.859 | 5191.0 | 0 |
| 28827 | 39 | 4.800000 | 1 | 999 | 0 | 1.43 | 47.2325 | -29.26 | 5.963 | 5228.1 | 0 |
| 28828 | 37 | 3.233333 | 1 | 4 | 1 | -3.37 | 46.2155 | -18.83 | 1.756 | 5017.5 | 1 |
| 28829 | 42 | 4.916667 | 2 | 999 | 0 | 1.13 | 46.9970 | -25.48 | 5.858 | 5191.0 | 0 |
| 28830 | 31 | 5.550000 | 2 | 999 | 0 | 1.43 | 46.9590 | -29.89 | 5.962 | 5228.1 | 0 |
28831 rows × 11 columns
from sklearn.linear_model import LogisticRegression
X=train_id.drop(target,axis=1)
y=train_id['subscribed']
from sklearn.ensemble import ExtraTreesClassifier
model=ExtraTreesClassifier()
model.fit(X,y)
ExtraTreesClassifier()
model.feature_importances_
array([0.15837812, 0.39500592, 0.07265032, 0.07162169, 0.02224527,
0.02634433, 0.02122306, 0.02618026, 0.15328945, 0.05306158])
cdf=pd.DataFrame(model.feature_importances_,index=X.columns,columns=['features'])
fig=py.figure(figsize=(15,10))
cdf.plot(kind='barh',figsize=(10,10))
<AxesSubplot:>
<Figure size 1080x720 with 0 Axes>
model_log=LogisticRegression(max_iter=500)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.42859, random_state=101)
model_log.fit(X_train,y_train)
LogisticRegression(max_iter=500)
predict=model_log.predict(X_test)
predict
array([0, 0, 0, ..., 0, 1, 0], dtype=int64)
from sklearn.metrics import classification_report , confusion_matrix,accuracy_score
print(classification_report(y_test,predict))
precision recall f1-score support
0 0.92 0.97 0.95 10929
1 0.65 0.35 0.46 1428
accuracy 0.90 12357
macro avg 0.78 0.66 0.70 12357
weighted avg 0.89 0.90 0.89 12357
print(confusion_matrix(y_test,predict))
[[10651 278] [ 922 506]]
accuracy_score(y_test,predict)
0.902889050740471
sns.heatmap(confusion_matrix(y_test,predict),annot=True,cmap='viridis')
<AxesSubplot:>
print(len(predict))
print('\n')
len(sample)
12357
12357
sample=pd.read_csv('sample_submission.csv')
sample.head()
| customer_id | subscribed | |
|---|---|---|
| 0 | customer_id_32884 | 1 |
| 1 | customer_id_3169 | 1 |
| 2 | customer_id_32206 | 1 |
| 3 | customer_id_9403 | 1 |
| 4 | customer_id_14020 | 1 |
len(sample)
12357
len(predict)
12357
sample=sample.copy()
sample['subscribed']=predict
sample.to_csv('firstai.csv',index=False)
pd.read_csv('firstai.csv')
| customer_id | subscribed | |
|---|---|---|
| 0 | customer_id_32884 | 0 |
| 1 | customer_id_3169 | 0 |
| 2 | customer_id_32206 | 0 |
| 3 | customer_id_9403 | 0 |
| 4 | customer_id_14020 | 0 |
| ... | ... | ... |
| 12352 | customer_id_15908 | 0 |
| 12353 | customer_id_28222 | 0 |
| 12354 | customer_id_14194 | 0 |
| 12355 | customer_id_19764 | 1 |
| 12356 | customer_id_26052 | 0 |
12357 rows × 2 columns
from sklearn.svm import SVC
model_sv=SVC()
model_sv.fit(X_train,y_train)
SVC()
predict_sv=model_sv.predict(X_test)
print(classification_report(y_test,predict_sv))
precision recall f1-score support
0 0.90 0.98 0.94 10929
1 0.63 0.20 0.30 1428
accuracy 0.89 12357
macro avg 0.76 0.59 0.62 12357
weighted avg 0.87 0.89 0.87 12357
print(confusion_matrix(y_test,predict))
[[10651 278] [ 922 506]]
accuracy_score(y_test,predict)
0.902889050740471
sample['subscribed']=predict_sv
sample.to_csv('seconda1.csv',index=False)
from sklearn.ensemble import RandomForestClassifier
model_random=RandomForestClassifier(n_estimators=200)
model_random.fit(X_train,y_train)
RandomForestClassifier(n_estimators=200)
model_predict=model_random.predict(X_test)
model_predict
array([0, 0, 0, ..., 0, 1, 0], dtype=int64)
print(confusion_matrix(y_test,model_predict))
print('\n')
print(classification_report(y_test,model_predict))
[[10538 391]
[ 718 710]]
precision recall f1-score support
0 0.94 0.96 0.95 10929
1 0.64 0.50 0.56 1428
accuracy 0.91 12357
macro avg 0.79 0.73 0.76 12357
weighted avg 0.90 0.91 0.91 12357
accuracy_score(y_test,predict)
0.902889050740471
sample['subscribed']=model_predict
sample.to_csv('thirdai.csv',index=False)
train
| customer_id | age | job | marital | education | default | housing | loan | contact | month | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | customer_id_39075 | 31 | admin. | married | university.degree | no | no | no | cellular | dec | ... | 3 | 999 | 1 | failure | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | 0 |
| 1 | customer_id_34855 | 31 | technician | single | university.degree | no | no | no | telephone | may | ... | 4 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| 2 | customer_id_7107 | 47 | blue-collar | married | basic.6y | unknown | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | 0 |
| 3 | customer_id_31614 | 36 | services | married | university.degree | no | no | no | cellular | may | ... | 1 | 999 | 1 | failure | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | 0 |
| 4 | customer_id_34878 | 34 | admin. | single | high.school | no | no | no | cellular | may | ... | 9 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28826 | customer_id_6265 | 60 | retired | married | professional.course | unknown | no | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.859 | 5191.0 | 0 |
| 28827 | customer_id_11284 | 39 | management | married | university.degree | no | no | no | telephone | jun | ... | 1 | 999 | 0 | nonexistent | 1.43 | 47.2325 | -29.26 | 5.963 | 5228.1 | 0 |
| 28828 | customer_id_38158 | 37 | admin. | married | high.school | no | yes | no | cellular | oct | ... | 1 | 4 | 1 | success | -3.37 | 46.2155 | -18.83 | 1.756 | 5017.5 | 1 |
| 28829 | customer_id_860 | 42 | management | married | university.degree | no | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.858 | 5191.0 | 0 |
| 28830 | customer_id_15795 | 31 | admin. | single | university.degree | no | yes | no | cellular | jul | ... | 2 | 999 | 0 | nonexistent | 1.43 | 46.9590 | -29.89 | 5.962 | 5228.1 | 0 |
28831 rows × 22 columns
train_1=pd.get_dummies(train,drop_first=True)
train_1.head()
| age | duration | campaign | pdays | previous | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | ... | month_may | month_nov | month_oct | month_sep | day_of_week_mon | day_of_week_thu | day_of_week_tue | day_of_week_wed | poutcome_nonexistent | poutcome_success | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 31 | 1.283333 | 3 | 999 | 1 | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1 | 31 | 0.200000 | 4 | 999 | 0 | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 47 | 4.616667 | 2 | 999 | 0 | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3 | 36 | 1.166667 | 1 | 999 | 1 | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 34 | 19.683333 | 9 | 999 | 0 | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 28884 columns
X=train_1.drop(target,axis=1)
y=train_1['subscribed']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.42859, random_state=102)
from sklearn.linear_model import LogisticRegression
log_reg=LogisticRegression(max_iter=1000)
cdf=train.drop('customer_id',axis=1)
train_model=train.drop(['customer_id'],axis=1)
train_model=pd.get_dummies(train_model)
X=train_model.drop(target,axis=1)
y=train_model['subscribed']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.42859, random_state=102)
train
| customer_id | age | job | marital | education | default | housing | loan | contact | month | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | customer_id_39075 | 31 | admin. | married | university.degree | no | no | no | cellular | dec | ... | 3 | 999 | 1 | failure | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | 0 |
| 1 | customer_id_34855 | 31 | technician | single | university.degree | no | no | no | telephone | may | ... | 4 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| 2 | customer_id_7107 | 47 | blue-collar | married | basic.6y | unknown | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | 0 |
| 3 | customer_id_31614 | 36 | services | married | university.degree | no | no | no | cellular | may | ... | 1 | 999 | 1 | failure | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | 0 |
| 4 | customer_id_34878 | 34 | admin. | single | high.school | no | no | no | cellular | may | ... | 9 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28826 | customer_id_6265 | 60 | retired | married | professional.course | unknown | no | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.859 | 5191.0 | 0 |
| 28827 | customer_id_11284 | 39 | management | married | university.degree | no | no | no | telephone | jun | ... | 1 | 999 | 0 | nonexistent | 1.43 | 47.2325 | -29.26 | 5.963 | 5228.1 | 0 |
| 28828 | customer_id_38158 | 37 | admin. | married | high.school | no | yes | no | cellular | oct | ... | 1 | 4 | 1 | success | -3.37 | 46.2155 | -18.83 | 1.756 | 5017.5 | 1 |
| 28829 | customer_id_860 | 42 | management | married | university.degree | no | yes | no | telephone | may | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.858 | 5191.0 | 0 |
| 28830 | customer_id_15795 | 31 | admin. | single | university.degree | no | yes | no | cellular | jul | ... | 2 | 999 | 0 | nonexistent | 1.43 | 46.9590 | -29.89 | 5.962 | 5228.1 | 0 |
28831 rows × 22 columns
cdf.head()
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 31 | admin. | married | university.degree | no | no | no | cellular | dec | mon | ... | 3 | 999 | 1 | failure | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | 0 |
| 1 | 31 | technician | single | university.degree | no | no | no | telephone | may | fri | ... | 4 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
| 2 | 47 | blue-collar | married | basic.6y | unknown | yes | no | telephone | may | thu | ... | 2 | 999 | 0 | nonexistent | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | 0 |
| 3 | 36 | services | married | university.degree | no | no | no | cellular | may | thu | ... | 1 | 999 | 1 | failure | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | 0 |
| 4 | 34 | admin. | single | high.school | no | no | no | cellular | may | fri | ... | 9 | 999 | 0 | nonexistent | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | 0 |
5 rows × 21 columns
cdf=pd.get_dummies(cdf,drop_first=1)
cdf.head()
| age | duration | campaign | pdays | previous | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | ... | month_may | month_nov | month_oct | month_sep | day_of_week_mon | day_of_week_thu | day_of_week_tue | day_of_week_wed | poutcome_nonexistent | poutcome_success | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 31 | 1.283333 | 3 | 999 | 1 | -2.97 | 46.3565 | -23.10 | 1.711 | 5023.5 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1 | 31 | 0.200000 | 4 | 999 | 0 | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 47 | 4.616667 | 2 | 999 | 0 | 1.13 | 46.9970 | -25.48 | 5.862 | 5191.0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3 | 36 | 1.166667 | 1 | 999 | 1 | -1.77 | 46.4465 | -32.34 | 2.329 | 5099.1 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 34 | 19.683333 | 9 | 999 | 0 | -1.77 | 46.4465 | -32.34 | 2.252 | 5099.1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 54 columns
log_reg.fit(X_train,y_train)
LogisticRegression(max_iter=1000)
logpredict=log_reg.predict(X_test)
logpredict
array([0, 1, 0, ..., 0, 0, 0], dtype=int64)
len(logpredict)
12357
print(classification_report(y_test,logpredict))
print('\n')
print(confusion_matrix(y_test,logpredict))
precision recall f1-score support
0 0.93 0.97 0.95 10970
1 0.66 0.42 0.51 1387
accuracy 0.91 12357
macro avg 0.79 0.70 0.73 12357
weighted avg 0.90 0.91 0.90 12357
[[10667 303]
[ 805 582]]
accuracy_score(y_test,logpredict)
sample=sample.copy()
sample['subscribed']=logpredict
sample
| customer_id | subscribed | |
|---|---|---|
| 0 | customer_id_32884 | 0 |
| 1 | customer_id_3169 | 1 |
| 2 | customer_id_32206 | 0 |
| 3 | customer_id_9403 | 0 |
| 4 | customer_id_14020 | 0 |
| ... | ... | ... |
| 12352 | customer_id_15908 | 0 |
| 12353 | customer_id_28222 | 0 |
| 12354 | customer_id_14194 | 0 |
| 12355 | customer_id_19764 | 0 |
| 12356 | customer_id_26052 | 0 |
12357 rows × 2 columns
sample.to_csv('fourthai.csv',index=False)
from sklearn.ensemble import RandomForestClassifier
X=cdf.drop(target,axis=1)
y=cdf['subscribed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.569899, random_state=102)
rdf=RandomForestClassifier()
rdf.fit(X_train,y_train)
RandomForestClassifier()
rdf_predict=rdf.predict(X_train)
len(rdf_predict)
12400
rd=pd.DataFrame(rdf_predict,columns=target)
rdf=rd[:len(sample)]
sample=sample.copy()
sample['subscribed']=rdf
sample.to_csv('sixthai.csv',index=False)
rdf
| subscribed | |
|---|---|
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 1 |
| ... | ... |
| 12352 | 0 |
| 12353 | 0 |
| 12354 | 0 |
| 12355 | 0 |
| 12356 | 1 |
12357 rows × 1 columns